Each row in MathAchieve represents each of the 7185 students within 160 schools. Each row in MathAchSchool represents each of 160 schools.
[http://www.stat.rutgers.edu/home/yhung/Stat586/Mixed%20model/appendix-mixed-models.pdf]
The variables that we are going to use: - School - SES - MathAch - Sector - MEANSES
data("MathAchieve")
summary(MathAchieve)
## School Minority Sex SES MathAch
## 2305 : 67 No :5211 Male :3390 Min. :-3.758000 Min. :-2.832
## 5619 : 66 Yes:1974 Female:3795 1st Qu.:-0.538000 1st Qu.: 7.275
## 4292 : 65 Median : 0.002000 Median :13.131
## 8857 : 64 Mean : 0.000143 Mean :12.748
## 4042 : 64 3rd Qu.: 0.602000 3rd Qu.:18.317
## 3610 : 64 Max. : 2.692000 Max. :24.993
## (Other):6795
## MEANSES
## Min. :-1.188000
## 1st Qu.:-0.317000
## Median : 0.038000
## Mean : 0.006138
## 3rd Qu.: 0.333000
## Max. : 0.831000
##
attach(MathAchieve)
mses <- tapply(SES, School, mean)
detach(MathAchieve)
data("MathAchSchool")
head(MathAchSchool)
## School Size Sector PRACAD DISCLIM HIMINTY MEANSES
## 1224 1224 842 Public 0.35 1.597 0 -0.428
## 1288 1288 1855 Public 0.27 0.174 0 0.128
## 1296 1296 1719 Public 0.32 -0.137 1 -0.420
## 1308 1308 716 Catholic 0.96 -0.622 0 0.534
## 1317 1317 455 Catholic 0.95 -1.694 1 0.351
## 1358 1358 1430 Public 0.25 1.535 0 -0.014
Since the MEANSES is different in MathAchieve and MathAchSchool, the new mean of SES for students in each school is generated.
Bryk <- as.data.frame(MathAchieve[,c("School", "SES", "MathAch")])
Bryk$meanses <- mses[as.character(Bryk$School)]
sector <- MathAchSchool$Sector
names(sector) <- row.names(MathAchSchool)
Bryk$Sector <- sector[as.character(Bryk$School)]
contrasts(Bryk$Sector)
## Catholic
## Public 0
## Catholic 1
ggplot(MathAchieve, aes(SES, MathAch)) + geom_point() + facet_wrap(~School)
fit <- lmer(MathAch ~ SES + Sector + (1|School), data = Bryk)
N <- nrow(Bryk)
school <- unique(as.character(Bryk$School))
nschool <- length(school)
y <- Bryk$MathAch
X <- model.matrix(~ SES + Sector, data = Bryk)
beta <- fixef(fit)
Z <- model.matrix(~ School - 1, data = Bryk)
u <- ranef(fit)$School[,1]
q <- length(u)
vc <- as.data.frame(VarCorr(fit))
R <- vc$vcov[vc$grp == "Residual"] * diag(N)
G <- vc$vcov[vc$grp == "School"] * diag(q)
Sigma <- Z %*% G %*% t(Z) + R
Sigma_inv <- solve(Sigma)
varMargRes <- Sigma - X %*% solve(t(X) %*% solve(Sigma) %*% X) %*% t(X)
P <- Sigma_inv - Sigma_inv %*% X %*% solve(t(X) %*% Sigma_inv %*% X) %*% t(X) %*% Sigma_inv
varCondRes <- R %*% P %*% R
varU <- G - G %*% t(Z) %*% P %*% Z %*% G
fit_bryk <- Bryk %>%
mutate(fitted = as.vector(X %*% beta),
resm = MathAch - fitted,
resc = MathAch - fitted - as.vector(Z %*% u),
index = 1:n()) %>%
rowwise() %>%
mutate(resm_std = resm / sqrt(diag(varMargRes)[index]),
resc_std = resc / sqrt(diag(varCondRes)[index])) %>%
ungroup()
Lesaffre and Verbeke comment that when the within-unit covariance structure is adequate, \(V_i = ||I_{m_i} - \varepsilon_i \varepsilon^\top||^2\), where \(\varepsilon_i = \hat \Omega ^{-1/2} \hat \xi\) should be close to zero. However, Singer consider replacing \(\varepsilon_i\) in \(V_i\) with the standardised marginal residuals \(\hat \xi_i^* = [\hat V(\hat \xi_i)]^{-1/2} \hat \xi_i\), where \(V(\hat \xi_i)\) refers to the diagonal block of \(\Omega - X(X^\top \Omega^{-1} X)^{-1} X^\top\) associated to the \(i\) unit.
unit_df <- expand_grid(school) %>%
# cannot follow Vi quit well
mutate(Vi = map_dbl(school, ~{
ind <- Bryk %>%
mutate(index = 1:n()) %>%
filter(School == .x) %>%
pull(index)
mi <- length(ind)
# standardised marginal residual
Ei <- solve(expm::sqrtm(varMargRes[ind, ind])) %*% fit_bryk$resm[ind]
sqrt(sum((diag(mi) - Ei %*% t(Ei))^2)) / mi
}),
Mi = as.vector(t(u) %*% solve(varU) %*% u)
)
ggplot(fit_bryk, aes(fitted, resm_std)) +
geom_hline(yintercept = 0) +
geom_point()
ggplot(fit_bryk, aes(index, resm_std)) + geom_point() + geom_hline(yintercept = 0)
ggplot(unit_df, aes(1:nrow(unit_df), Vi)) + geom_point()+ xlab("Unit indicies") + ylab("Modified Lesaffre-Verbeke index")
ggplot(fit_bryk, aes(index, resc_std)) + geom_point() + geom_hline(yintercept = 0)
ggplot(fit_bryk, aes(fitted, resc_std)) + geom_point() + geom_hline(yintercept = 0)
ggplot(fit_bryk, aes(sample = resc_std)) +
geom_qq() + geom_qq_line(color = "red")
ggplot(unit_df, aes(1:nrow(unit_df), Mi)) + geom_point() + xlab("Unit index") + ylab("Manhalanobis distance")
ggplot(unit_df, aes(sample = Mi)) +
geom_qq(distribution = stats::qchisq,
dparams = list(df = q)) +
geom_qq_line(color = "red",
distribution = stats::qchisq,
dparams = list(df = q))
fit.sim <- simulate(fit, nsim = 19, seed = 1234)
fit.refit <- lapply(fit.sim, refit, object = fit)
fit.simy <- lapply(fit.refit, function(x) getME(x, "y"))
fit.sim.y <- do.call("cbind", fit.simy)
fit.sim.y <- reshape2::melt(fit.sim.y)[-1]
names(fit.sim.y) <- c(".n", "y")
fit.sim.y$.n <- as.numeric(str_extract(fit.sim.y$.n, "\\d+"))
fit.sim.y$School <- rep(Bryk$School, 19)
fit.sim.y$SES <- rep(Bryk$SES, 19)
fit.sim.y$Sector <- rep(Bryk$Sector, 19)
ggplot(MathAchieve, aes(Sex, MathAch)) +
geom_beeswarm(alpha = .2) +
facet_grid(. ~Minority)
length(unique(MathAchieve$School))
## [1] 160
160 schools with the MEANSES is the nummeric vector of the mean SES for the school.
df <- MathAchieve %>% mutate(Sex = ifelse(Sex == "Male", 0, 1), Minority = ifelse(Minority == "No", 0, 1))
str(df)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 7185 obs. of 6 variables:
## $ School : Ord.factor w/ 160 levels "8367"<"8854"<..: 59 59 59 59 59 59 59 59 59 59 ...
## $ Minority: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sex : num 1 1 0 0 0 0 1 0 1 0 ...
## $ SES : num -1.528 -0.588 -0.528 -0.668 -0.158 ...
## $ MathAch : num 5.88 19.71 20.35 8.78 17.9 ...
## $ MEANSES : num -0.428 -0.428 -0.428 -0.428 -0.428 -0.428 -0.428 -0.428 -0.428 -0.428 ...
## - attr(*, "formula")=Class 'formula' language MathAch ~ SES | School
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## - attr(*, "labels")=List of 2
## ..$ y: chr "Mathematics Achievement score"
## ..$ x: chr "Socio-economic score"
## - attr(*, "FUN")=function (x)
## ..- attr(*, "source")= chr "function (x) max(x, na.rm = TRUE)"
## - attr(*, "order.groups")= logi TRUE
model <- lmer(MathAch ~ Sex + SES + (1|School) + (1|MEANSES), data = df)
summary(model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: MathAch ~ Sex + SES + (1 | School) + (1 | MEANSES)
## Data: df
##
## REML criterion at convergence: 46595.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1769 -0.7375 0.0342 0.7658 2.8048
##
## Random effects:
## Groups Name Variance Std.Dev.
## School (Intercept) 4.4722 2.1148
## MEANSES (Intercept) 0.0193 0.1389
## Residual 36.8132 6.0674
## Number of obs: 7185, groups: School, 160; MEANSES, 150
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 13.2769 0.2025 65.557
## Sex -1.1874 0.1654 -7.178
## SES 2.3564 0.1054 22.348
##
## Correlation of Fixed Effects:
## (Intr) Sex
## Sex -0.426
## SES -0.021 0.056